import warnings
#warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import matplotlib.gridspec as gridspec
The data science problem that we are trying to solve is predicting whether or not an employee will choose to leave a company based on their profile. This is classification of an employee’s attrition status based on demographic information, work history, and job details. This problem is important because high attrition rates mean companies are losing talented workers and must constantly expend resources to hire and train new employees. Using our model, a company could try to predict whether a given employee has a high chance of wanting to leave the company and incentivize the employee to stay. Additionally, noticing the trends in employee attrition could allow the company to make company-wide changes to decrease the overall attrition rate.
To solve this problem, we’re using a fictional dataset created by IBM data scientists that includes HR information related to an employee’s work life, history, marital status, education, and more. Additionally, each employee has a label corresponding to the employee’s attrition. There are 34 features in this dataset, which include information such as age, gender, marital status, monthly income, and the number of years at the company. Using this dataset, we would like to be able to predict whether an employee will leave the company based on their profile.
Education 1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'
EnvironmentSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
JobInvolvement 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
JobSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
PerformanceRating 1 'Low' 2 'Good' 3 'Excellent' 4 'Outstanding'
RelationshipSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
WorkLifeBalance 1 'Bad' 2 'Good' 3 'Better' 4 'Best'
data = pd.read_csv("employee_attrition.csv")
print(data.shape)
data.head()
(1470, 35)
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
We first wanted to see if there were any null values in our dataset, which we checked by finding the number of null values for each feature.
data.isnull().sum()
Age 0 Attrition 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EmployeeCount 0 EmployeeNumber 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 Over18 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StandardHours 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
We saw that our dataset didn't contain any null values. We then used a function to drop duplicate rows from our dataframe, though we saw that there were no duplicate rows since the shape of the dataframe stayed the same.
# Drop duplicates
print("Before drop duplicates: ", data.shape)
data = data.drop_duplicates()
print("After drop duplicates: ", data.shape)
data.head()
Before drop duplicates: (1470, 35) After drop duplicates: (1470, 35)
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
We then separated the column with the attrition status label from the features columns so that we could use this data with the scikit-learn models.
features = data.loc[:, data.columns != 'Attrition']
labels = data['Attrition']
For the final piece of preparation before we explored the data, we decided to remove any columns with a single value. This is because a feature with only one possible value doesn't provide any differentiating information.
drop_f = []
for c in features:
count = features[c].value_counts().shape[0]
if count == 1:
drop_f.append(c)
drop_f
['EmployeeCount', 'Over18', 'StandardHours']
We saw that there were three features that only had one value: EmployeeCount, Over19, StandardHours. We then removed these columns from the dataframe.
# Remove features found in drop_f
features.drop(drop_f, axis=1, inplace=True)
features.head()
/home/daniel/.local/lib/python3.8/site-packages/pandas/core/frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeNumber | EnvironmentSatisfaction | Gender | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 2 | Female | ... | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 2 | 3 | Male | ... | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 4 | 4 | Male | ... | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 5 | 4 | Female | ... | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 7 | 1 | Male | ... | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 31 columns
'EmployeeNumber' also doesn't provide information for the employees because it's a random ID, so we removed this column.
# Remove irrelevant columns
remove_cols = ['EmployeeNumber']
features.drop(remove_cols, axis=1, inplace=True)
features.head()
/home/daniel/.local/lib/python3.8/site-packages/pandas/core/frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | HourlyRate | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 2 | Female | 94 | ... | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 3 | Male | 61 | ... | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 4 | Male | 92 | ... | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 4 | Female | 56 | ... | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | Male | 40 | ... | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 30 columns
To get a broad idea of what the dataset and values for each feature looked like, we took a look at the mean, median, and standard deviation for each feature. We also printed the numbers of records in each class for this dataset so we can check for class imbalance.
# Mean and Std Dev. for all features
print(labels.value_counts())
features.agg(['mean','median','std'])
No 1233 Yes 237 Name: Attrition, dtype: int64
| Age | DailyRate | DistanceFromHome | Education | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | 2.728571 | 6502.931293 | ... | 3.153741 | 2.712245 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| median | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | 3.000000 | 4919.000000 | ... | 3.000000 | 3.000000 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | 1.102846 | 4707.956783 | ... | 0.360824 | 1.081209 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
3 rows × 23 columns
From this, we could see that 84% of the employees in this dataset stayed with the company, resulting in a class imbalance that we would need to address when creating our model.
To get a better picture of the data, we generated histograms for each feature, showing the number of records with each possible feature value.
# Look at histograms
fig, axs = plt.subplots(10, 3, figsize=(40, 40))
axs = axs.flatten()
for i, c in enumerate(features):
sns.histplot(x=c, data=features, ax=axs[i])
plt.show()
We decided to generate a correlation matrix for the features. We thought we'd be able to remove some of the features regarding each employee's rate (DailyRate, HourlyRate, MonthlyRate), since it seemed that these features would contain the same information at different time scales. We also thought there might be correlation between each employee's monthly income and monthly rate. To confirm these suspicions, we wanted to look at the correlations between the different features.
correlations = features.corr(method='pearson')
fig = plt.figure(figsize=(30, 30))
sns.heatmap(correlations, annot=True, cmap='GnBu_r', center=1)
<AxesSubplot:>
After checking the correlation heatmap, we saw that the different rate features were not correlated with each other, nor were they correlated with monthly income. Based on this information, we decided to keep the different rate features.
We also noticed that JobLevel and MonthlyIncome are highly correlated. However, because JobLevel differentiates tiers amongst employees with the same job roles (JobRole), we decided not to drop either feature.
We then wanted to look at the relationship between different feature values and attrition rates. We divided the features up into five feature groups: satisfaction, job specifics, demographics, income, and work history with the current company.
For the satisfaction category, we looked at the following features: WorkLifeBalance, RelationshipSatisfaction, JobSatisfaction, EnvironmentSatisfaction, and OverTime. For each feature, we graphed the number of records with each value by attrition status.
fig, axs = plt.subplots(2, 3, figsize=(30, 12))
axs = axs.flatten()
satisfaction = ["WorkLifeBalance",
"RelationshipSatisfaction",
"JobSatisfaction",
"EnvironmentSatisfaction",
"OverTime"]
for i, f in enumerate(satisfaction):
sns.countplot(x=f, hue='Attrition', data=data, ax=axs[i])
plt.show()
We could see there is a much higher attrition rate in the population of employees who had to work overtime compared to the population of employees without overtime. As expected, we could also see that there are higher rates of attrition in the populations of employees with lower environment satisfaction, worse work-life balance, and lower job satisfaction.
The next category of features we looked at was related to details about each employee's job, which included the following features: JobLevel, JobInvolvement, BusinessTravel, StockOptionLevel, NumCompaniesWorked, Department, and JobRole. We again graphed the number of records with each feature value by attrition status. For NumCompaniesWorked, we generated a Kernel Density Estimate (KDE) plot, since there was a higher range of possible values.
jobSpecifics = [ "JobLevel", "JobInvolvement", "BusinessTravel",
"StockOptionLevel", "NumCompaniesWorked", "Department"]
fig, axs = plt.subplots(2, 3, figsize=(30, 12))
axs = axs.flatten()
for i, f in enumerate(jobSpecifics):
if f == 'NumCompaniesWorked':
sns.kdeplot(x=f, data=data, hue='Attrition', ax=axs[i], fill=True)
else:
sns.countplot(x=f, data=data, hue='Attrition', ax=axs[i])
plt.show()
fig, axs = plt.subplots(1, figsize=(30, 12))
sns.countplot(x='JobRole', data=data, hue='Attrition', ax=axs)
plt.show()
We could see the highest attrition rates in the employee populations with the lowest job level, lowest job involvement, and frequent travel. We could also see high attrition rates in the sales representative role.
The next category of features that we looked at was demographics, which included the MaritalStatus, Gender, EducationField, Education, DistanceFromHome, and Age features. We generated KDE plots for DistanceFromHome and Age, and we plotted the other features by value and attrition status.
demography = ["MaritalStatus",
"Gender",
"EducationField",
"Education",
"DistanceFromHome",
'Age']
fig, axs = plt.subplots(2, 3, figsize=(30, 12))
axs = axs.flatten()
for i, f in enumerate(demography):
if f == 'DistanceFromHome' or f == 'Age':
sns.kdeplot(x=f, data=data, hue='Attrition', ax=axs[i], fill=True)
else:
sns.countplot(x=f, data=data, hue='Attrition',ax=axs[i])
plt.show()
We could see that the population of single employees had a higher rate of attrition compared to married and divorced employees. There was also a higher proportion of employees who left the company in the population that had a farther distance from home compared to the population that was closer.
We then looked at the features related to each employee's work history with the company: TrainingTimesLastYear, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, and YearsWithCurrManager. We did this by creating paired box plots for each feature with TrainingTimesLastYear (since TrainingTimesLastYear is the only categorical feature in this group), separating the plots by attrition status.
# do the same for TrainingTimesLastYear: boxplots
fig, axes = plt.subplots(2, 3, figsize=(30,12))
axes = axes.flatten()
sns.boxplot(x="TrainingTimesLastYear", y="TotalWorkingYears", hue="Attrition",
width=0.6, data=data, ax=axes[0])
# label the countplot with percentages within group
ax = sns.countplot(x="TrainingTimesLastYear", hue="Attrition", fill=True,
data=data, ax=axes[1])
totals = []
for v in set(data['TrainingTimesLastYear']):
t = len(data.loc[data['TrainingTimesLastYear'] == v, :])
totals.append(t)
sns.boxplot(x="TrainingTimesLastYear", y="YearsAtCompany", hue="Attrition",
width=0.6, data=data, ax=axes[2])
sns.boxplot(x="TrainingTimesLastYear", y="YearsInCurrentRole", hue="Attrition",
width=0.6, data=data, ax=axes[3])
sns.boxplot(x="TrainingTimesLastYear", y="YearsSinceLastPromotion", hue="Attrition",
width=0.6, data=data, ax=axes[4])
sns.boxplot(x="TrainingTimesLastYear", y="YearsWithCurrManager", hue="Attrition",
width=0.6, data=data, ax=axes[5])
# add back legend to the countplot at middle
axes[1].legend(title="Attrition")
plt.show()
From these plots, we can see that the population of employees leaving the company tended to have less years in their current role and with their current manager.
The last group of features we looked at was related to income and rates. This included PercentSalaryHike, MonthlyRate, HourlyRate, DailyRate, MonthlyIncome, and PerformanceRating. We created paired box plots for each feature with PerformanceRating (since PerformanceRating is the only categorical feature in this group), separating the plots by attrition status.
income = ["PercentSalaryHike",
"MonthlyRate",
"HourlyRate",
"DailyRate",
"MonthlyIncome",
"PerformanceRating",
"Attrition"]
fig, axes = plt.subplots(2, 3, figsize=(30,12))
axes = axes.flatten()
sns.boxplot(x="PerformanceRating", y="MonthlyRate", hue="Attrition",
width=0.6, data=data, ax=axes[0])
# label the countplot with percentages within group
#
ax = sns.countplot(x="PerformanceRating", hue="Attrition", fill=True,
data=data, ax=axes[1])
totalThree = len(data.loc[data['PerformanceRating'] == 3, :])
totalFour = len(data.loc[data['PerformanceRating'] == 4, :])
for p, t in zip(ax.patches, [totalThree, totalFour, totalThree, totalFour]):
print(p, t)
percentage = '{:.2f}%'.format(100 * p.get_height()/t)
x = p.get_x() + p.get_width()/2 - 0.125
y = p.get_y() + p.get_height() + 5
ax.annotate(percentage, (x, y))
sns.boxplot(x="PerformanceRating", y="HourlyRate", hue="Attrition",
width=0.6, data=data, ax=axes[2])
sns.boxplot(x="PerformanceRating", y="DailyRate", hue="Attrition",
width=0.6, data=data, ax=axes[3])
sns.boxplot(x="PerformanceRating", y="MonthlyIncome", hue="Attrition",
width=0.6, data=data, ax=axes[4])
sns.boxplot(x="PerformanceRating", y="PercentSalaryHike", hue="Attrition",
width=0.6, data=data, ax=axes[5])
# remove all legends
for ax in axes:
ax.legend([],[], frameon=False)
# add back legend to the countplot at middle
axes[1].legend(title="Attrition")
plt.show()
income.remove("PerformanceRating")
pair = sns.pairplot(data[income], diag_kind='kde', hue='Attrition')
pair.fig.set_size_inches(30, 12)
Rectangle(xy=(-0.4, 0), width=0.4, height=200, angle=0) 1244 Rectangle(xy=(0.6, 0), width=0.4, height=37, angle=0) 226 Rectangle(xy=(-2.77556e-17, 0), width=0.4, height=1044, angle=0) 1244 Rectangle(xy=(1, 0), width=0.4, height=189, angle=0) 226
The boxplots show that the income features with respect to 'PerformanceRating' show no outliers with the exception of 'MonthlyIncome' where outliers are present due to income gaps between employees. We show the scatter plots to see if there are any outliers with the data in terms of clustering. From visual inspection, these scatter plots tell us that there are no outliers in the data for the corresponding features compared in the graphs.
We first one-hot encoded the non-numeric features, which included the following features: BusinessTravel, Department, EducationField, JobRole, and MaritalStatus.
from sklearn.preprocessing import LabelEncoder
nnum_features = features.select_dtypes(include=[object])
print("Features of type object: ", nnum_features.columns)
le = LabelEncoder()
enc_nnum_features = nnum_features.apply(le.fit_transform)
enc_nnum_features = enc_nnum_features[['Gender', 'OverTime']]
cols = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus']
dummies = pd.get_dummies(features[cols], prefix=cols)
# combine encoded dataframe with original
cat1 = features.select_dtypes(exclude=[object])
print("Before one-hot encoded features: ", features.shape)
u_features = pd.concat([cat1, enc_nnum_features, dummies], axis=1)
print("After one-hot encoded features: ", features.shape)
u_features.info()
Features of type object: Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
'MaritalStatus', 'OverTime'],
dtype='object')
Before one-hot encoded features: (1470, 30)
After one-hot encoded features: (1470, 30)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1470 entries, 0 to 1469
Data columns (total 49 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 1470 non-null int64
1 DailyRate 1470 non-null int64
2 DistanceFromHome 1470 non-null int64
3 Education 1470 non-null int64
4 EnvironmentSatisfaction 1470 non-null int64
5 HourlyRate 1470 non-null int64
6 JobInvolvement 1470 non-null int64
7 JobLevel 1470 non-null int64
8 JobSatisfaction 1470 non-null int64
9 MonthlyIncome 1470 non-null int64
10 MonthlyRate 1470 non-null int64
11 NumCompaniesWorked 1470 non-null int64
12 PercentSalaryHike 1470 non-null int64
13 PerformanceRating 1470 non-null int64
14 RelationshipSatisfaction 1470 non-null int64
15 StockOptionLevel 1470 non-null int64
16 TotalWorkingYears 1470 non-null int64
17 TrainingTimesLastYear 1470 non-null int64
18 WorkLifeBalance 1470 non-null int64
19 YearsAtCompany 1470 non-null int64
20 YearsInCurrentRole 1470 non-null int64
21 YearsSinceLastPromotion 1470 non-null int64
22 YearsWithCurrManager 1470 non-null int64
23 Gender 1470 non-null int64
24 OverTime 1470 non-null int64
25 BusinessTravel_Non-Travel 1470 non-null uint8
26 BusinessTravel_Travel_Frequently 1470 non-null uint8
27 BusinessTravel_Travel_Rarely 1470 non-null uint8
28 Department_Human Resources 1470 non-null uint8
29 Department_Research & Development 1470 non-null uint8
30 Department_Sales 1470 non-null uint8
31 EducationField_Human Resources 1470 non-null uint8
32 EducationField_Life Sciences 1470 non-null uint8
33 EducationField_Marketing 1470 non-null uint8
34 EducationField_Medical 1470 non-null uint8
35 EducationField_Other 1470 non-null uint8
36 EducationField_Technical Degree 1470 non-null uint8
37 JobRole_Healthcare Representative 1470 non-null uint8
38 JobRole_Human Resources 1470 non-null uint8
39 JobRole_Laboratory Technician 1470 non-null uint8
40 JobRole_Manager 1470 non-null uint8
41 JobRole_Manufacturing Director 1470 non-null uint8
42 JobRole_Research Director 1470 non-null uint8
43 JobRole_Research Scientist 1470 non-null uint8
44 JobRole_Sales Executive 1470 non-null uint8
45 JobRole_Sales Representative 1470 non-null uint8
46 MaritalStatus_Divorced 1470 non-null uint8
47 MaritalStatus_Married 1470 non-null uint8
48 MaritalStatus_Single 1470 non-null uint8
dtypes: int64(25), uint8(24)
memory usage: 365.3 KB
We decided to agglomerate the three satisfaction features (EnvironmentSatisfaction, JobSatisfaction, RelationshipSatisfaction) into one overall satisfaction index. We did this by replacing the three columns with a new column that contained the sum of those columns. We did this because it reduced the number of features in our dataset while still giving us a measure for each employee's overall satisfaction.
# create overall satisfaction index
def calc_satisfaction(row):
return row['JobSatisfaction'] + row['EnvironmentSatisfaction'] + row['RelationshipSatisfaction']
u_features['OverallSatisfaction'] = u_features.apply (lambda row: calc_satisfaction(row), axis=1)
u_features.head()
remove_cols = ['JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction']
u_features.drop(remove_cols, axis=1, inplace=True)
u_features.head()
| Age | DailyRate | DistanceFromHome | Education | HourlyRate | JobInvolvement | JobLevel | MonthlyIncome | MonthlyRate | NumCompaniesWorked | ... | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | OverallSatisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1102 | 1 | 2 | 94 | 3 | 2 | 5993 | 19479 | 8 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 7 |
| 1 | 49 | 279 | 8 | 1 | 61 | 2 | 2 | 5130 | 24907 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 9 |
| 2 | 37 | 1373 | 2 | 2 | 92 | 2 | 1 | 2090 | 2396 | 6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 9 |
| 3 | 33 | 1392 | 3 | 4 | 56 | 3 | 1 | 2909 | 23159 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10 |
| 4 | 27 | 591 | 2 | 1 | 40 | 3 | 1 | 3468 | 16632 | 9 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 7 |
5 rows × 47 columns
We binned certain features to reduce runtime for the models. 'Age' was binned because employees in certain age groups are likely to share similar traits. We binned employees by age into the following age groups: '18-24’, ‘25-34’, ‘35-45’, ‘45-55’, ‘56+’.
The rate and income features were binned since after analysis these features provided little information to justify a higher resolution.
# Bin features
bin_values = ['Age', 'DailyRate', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate']
bin_features = u_features.copy()
for b in bin_values:
min = u_features[b].min()
max = u_features[b].max()
# print(min, max)
width = (max-min)/5
bins = [-np.inf, min+width, min+width*2, min+width*3, min+width*4, np.inf]
bin_features[b] = np.digitize(u_features[b], bins, right=True)
bin_features.head()
| Age | DailyRate | DistanceFromHome | Education | HourlyRate | JobInvolvement | JobLevel | MonthlyIncome | MonthlyRate | NumCompaniesWorked | ... | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | OverallSatisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 4 | 1 | 2 | 5 | 3 | 2 | 2 | 4 | 8 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 7 |
| 1 | 4 | 1 | 8 | 1 | 3 | 2 | 2 | 2 | 5 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 9 |
| 2 | 3 | 5 | 2 | 2 | 5 | 2 | 1 | 1 | 1 | 6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 9 |
| 3 | 2 | 5 | 3 | 4 | 2 | 3 | 1 | 1 | 5 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10 |
| 4 | 2 | 2 | 2 | 1 | 1 | 3 | 1 | 1 | 3 | 9 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 7 |
5 rows × 47 columns
We chose to test the following classification models:
During our data exploration, we noticed there was a class imbalance. While there were 1,233 records for employees that did not leave the company, only 237 employees left the company. To address this imbalance, we also used the Synthetic Minority Oversampling Technique (SMOTE) provided by the imblearn package, which creates pseudo employees records, with each model and evaluated the changes in accuracy, recall, and precision.
We tested with Naive Bayes since it doesn't suffer from the curse of high dimensionality.
model = imPipeline([
('sampling', SMOTE()),
('classification', GaussianNB())
])
bayes = GaussianNB()
bayes_pred1 = cross_val_predict(bayes, bin_features, labels, cv=10, n_jobs=-1, method='predict_proba', verbose=1)
print("Naive Bayes without SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in bayes_pred1[:, 1] ]
print(sk.metrics.classification_report(labels, pred))
bayes_pred2 = cross_val_predict(model, bin_features, labels, cv=10, n_jobs=-1, method='predict_proba', verbose=1)
print("Naive Bayes with SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in bayes_pred2[:, 1] ]
print(sk.metrics.classification_report(labels, pred))
Naive Bayes without SMOTE
precision recall f1-score support
No 0.93 0.65 0.76 1233
Yes 0.29 0.75 0.42 237
accuracy 0.66 1470
macro avg 0.61 0.70 0.59 1470
weighted avg 0.83 0.66 0.71 1470
Naive Bayes with SMOTE
precision recall f1-score support
No 0.90 0.64 0.75 1233
Yes 0.26 0.64 0.37 237
accuracy 0.64 1470
macro avg 0.58 0.64 0.56 1470
weighted avg 0.80 0.64 0.69 1470
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 0.0s remaining: 0.1s
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 0.0s remaining: 0.2s
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 0.1s finished
Overall, Naive Bayes showed difficulties in the recall score for employees who did not leave as well as for the precision score for those who did leave IBM, leading to a low overall accuracy for the dataset.
We tested the dataset with a K-Nearest-Neighbors (KNN) classifier here.
# Perform KNN
scaler = StandardScaler()
pca = PCA()
knn = KNeighborsClassifier()
pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('knn', knn)])
param_grid = {
'pca__n_components': list(range(10, 31, 5)),
'knn__n_neighbors': list(range(5, 36, 5))
}
grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
knn_pred1 = cross_val_predict(grid, bin_features, labels, cv=10, n_jobs=-1, method='predict_proba', verbose=1)
print("KNN without SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in knn_pred1[:, 1]]
print(sk.metrics.classification_report(labels, pred))
grid.fit(bin_features, labels)
print(grid.best_params_)
pipeline = imPipeline(steps=[('scaler', scaler), ('smote', SMOTE()), ('pca', pca), ('knn', knn)])
grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
knn_pred2 = cross_val_predict(grid, bin_features, labels, cv=10, n_jobs=-1, method='predict_proba', verbose=1)
print("KNN with SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in knn_pred2[:, 1]]
print(sk.metrics.classification_report(labels, pred))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 6.0s remaining: 23.8s
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 6.3s finished
KNN without SMOTE
precision recall f1-score support
No 0.86 0.99 0.92 1233
Yes 0.70 0.16 0.27 237
accuracy 0.85 1470
macro avg 0.78 0.58 0.59 1470
weighted avg 0.83 0.85 0.81 1470
{'knn__n_neighbors': 15, 'pca__n_components': 25}
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 7.4s remaining: 29.7s
KNN with SMOTE
precision recall f1-score support
No 0.90 0.67 0.77 1233
Yes 0.26 0.60 0.36 237
accuracy 0.66 1470
macro avg 0.58 0.64 0.57 1470
weighted avg 0.79 0.66 0.70 1470
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 7.7s finished
KNN performed well with the data with respect to the precision and recall for employees who did not leave IBM. For employees who did leave, the precision score was much less than the same score for those who did not leave, and the recall score for employees who left was also very low, making this classifier bad at determining those who won't actually leave the company (Type II error).
We test the dataset using a Multi-layer Perceptron (MLP) Classifier here. We decided to test this model since MLPs are capable of learning complex and diverse decision boundaries from the data.
scaler = StandardScaler(with_mean=False, with_std=True)
nn = MLPClassifier()
param_grid = {
'nn__hidden_layer_sizes': [(x, ) for x in range(60, 101, 5)],
'nn__activation': ['logistic', 'tanh', 'relu'],
'nn__learning_rate_init': [.001, .003, .005, .007],
'nn__learning_rate': ['constant', 'invscaling', 'adaptive']
}
pipeline = Pipeline(steps=[('scaler', scaler), ('nn', nn)])
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
nn_pred1 = cross_val_predict(grid, bin_features, labels, cv=10, n_jobs=-1, verbose=1, method='predict_proba')
print("MLP without SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in nn_pred1[:, 1]]
print(sk.metrics.classification_report(labels, pred))
grid.fit(bin_features, labels)
print(grid.best_params_)
pipeline = imPipeline(steps=[('scaler', scaler), ('smote', SMOTE()), ('nn', nn)])
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
nn_pred2 = cross_val_predict(grid, bin_features, labels, cv=10, n_jobs=-1, verbose=1, method='predict_proba')
print("MLP with SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in nn_pred2[:, 1]]
print(sk.metrics.classification_report(labels, pred))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 38.5min remaining: 153.9min
MLP without SMOTE
precision recall f1-score support
No 0.90 0.96 0.93 1233
Yes 0.70 0.45 0.55 237
accuracy 0.88 1470
macro avg 0.80 0.71 0.74 1470
weighted avg 0.87 0.88 0.87 1470
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 38.7min finished
/home/daniel/.local/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
warnings.warn(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
{'nn__activation': 'logistic', 'nn__hidden_layer_sizes': (95,), 'nn__learning_rate': 'adaptive', 'nn__learning_rate_init': 0.001}
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 53.1min remaining: 212.5min
MLP with SMOTE
precision recall f1-score support
No 0.90 0.91 0.90 1233
Yes 0.50 0.48 0.49 237
accuracy 0.84 1470
macro avg 0.70 0.69 0.70 1470
weighted avg 0.84 0.84 0.84 1470
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 53.8min finished
As expected, the MLP classifier obtains a high score for the precision and recall of those employees who leave IBM. However, just like most of the classfiers we will look at, they suffer from a poor performance relative to the recall score for employees who do leave IBM.
We tested the dataset using a Support Vector Machine (SVM) Classifier here. We expected it to perform well on our dataset compared to classfiers such as Naive Bayes and KNN due to SVM's effectiveness with high dimensional datasets.
scaler = StandardScaler()
pca = PCA()
svm = SVC(probability=True)
param_grid = {
'pca__n_components': list(range(15, 45)),
'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
'svm__C': [0.1, 0.5, 1, 2]
}
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('svm', svm)])
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
svm_pred1 = cross_val_predict(grid, bin_features, labels, cv=10, method='predict_proba', verbose=1, n_jobs=-1)
print("SVM without SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in svm_pred1[:, 1]]
print(sk.metrics.classification_report(labels, pred))
grid.fit(bin_features, labels)
print(grid.best_params_)
pipe = imPipeline(steps=[('smote', SMOTE()), ('scaler', scaler), ('pca', pca), ('svm', svm)])
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
svm_pred2 = cross_val_predict(grid, bin_features, labels, cv=10, method='predict_proba', verbose=1, n_jobs=-1)
print("SVM with SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in svm_pred2[:, 1]]
print(sk.metrics.classification_report(labels, pred))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 1.3min remaining: 5.0min
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.3min finished
SVM without SMOTE
precision recall f1-score support
No 0.88 0.98 0.93 1233
Yes 0.80 0.32 0.45 237
accuracy 0.88 1470
macro avg 0.84 0.65 0.69 1470
weighted avg 0.87 0.88 0.85 1470
{'pca__n_components': 29, 'svm__C': 0.5, 'svm__kernel': 'linear'}
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 3.2min remaining: 12.7min
SVM with SMOTE
precision recall f1-score support
No 0.88 0.94 0.91 1233
Yes 0.53 0.32 0.40 237
accuracy 0.84 1470
macro avg 0.71 0.63 0.66 1470
weighted avg 0.82 0.84 0.83 1470
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 3.2min finished
The SVM classifier obtained a high precision and recall score for employees who did not leave IBM as well as a high precision for those who did leave. However, like many of the models tested, it did poorly in determining who will actually leave IBM and had a low recall score or high Type II error for those who will actually leave IBM.
SGDClassifier is a generalized linear classifier that will use Stochastic Gradient Descent as a solver. With SGDClassifier you can use lots of different loss functions that allows you to tune your model and find the best model for your data. Since SVM performed well, we believed SGDClassifier would also perform similarly.
scaler = StandardScaler()
sgd = SGDClassifier()
param_grid = {
'sgd__loss': ['log', 'modified_huber', 'squared_hinge', 'perceptron'],
'sgd__penalty': ['l2', 'l1', 'elasticnet'],
'sgd__class_weight': [None, 'balanced'],
'sgd__max_iter': [500, 1000, 2000],
'sgd__early_stopping': [True, False],
'sgd__alpha': [.0001, .001, .01, .1],
}
pipe = Pipeline(steps = [('scaler', scaler), ('sgd', sgd)])
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
sgd_pred1 = cross_val_predict(grid, bin_features, labels, cv=10, method='predict_proba', n_jobs=-1, verbose=1)
print("SGD without SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in sgd_pred1[:, 1]]
print(sk.metrics.classification_report(labels, pred))
grid.fit(bin_features, labels)
print(grid.best_params_)
pipe = imPipeline(steps = [('smote', SMOTE()), ('scaler', scaler), ('sgd', sgd)])
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
sgd_pred2 = cross_val_predict(grid, bin_features, labels, cv=10, method='predict_proba', n_jobs=-1, verbose=1)
print("SGD with SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in sgd_pred2[:, 1]]
print(sk.metrics.classification_report(labels, pred))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 1.1min remaining: 4.2min
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.1min finished
SGD without SMOTE
precision recall f1-score support
No 0.89 0.98 0.93 1233
Yes 0.76 0.35 0.48 237
accuracy 0.88 1470
macro avg 0.82 0.66 0.71 1470
weighted avg 0.87 0.88 0.86 1470
{'sgd__alpha': 0.01, 'sgd__class_weight': None, 'sgd__early_stopping': False, 'sgd__loss': 'log', 'sgd__max_iter': 1000, 'sgd__penalty': 'elasticnet'}
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 2.3min remaining: 9.1min
SGD with SMOTE
precision recall f1-score support
No 0.89 0.96 0.93 1233
Yes 0.67 0.38 0.49 237
accuracy 0.87 1470
macro avg 0.78 0.67 0.71 1470
weighted avg 0.85 0.87 0.85 1470
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 2.3min finished
SGDClassifier's results place it as another top performing model based on it's precision and recall score for those employees who do not leave IBM, but still performs poorly in terms of identifying those employees who actually leave IBM based on those records' recall and precision score.
We tested the data with a DecisionTree Classifier since it is one of the most interpretable models for classification.
param_grid = {
'max_depth': list(range(35, 55, 1)),
'min_samples_leaf': list(range(1, 45, 1)),
'max_features': ['sqrt', 'log2']
}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring="accuracy", cv=5, n_jobs=-1)
dt_pred1 = cross_val_predict(grid, bin_features, labels, cv=10, method='predict_proba', n_jobs=-1, verbose=1)
print("DecisionTree without SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in dt_pred1[:, 1]]
print(sk.metrics.classification_report(labels, pred))
grid.fit(bin_features, labels)
print(grid.best_params_)
param_grid = {
'dt__max_depth': list(range(35, 55, 1)),
'dt__min_samples_leaf': list(range(1, 45, 1)),
'dt__max_features': ['sqrt', 'log2']
}
pipe = imPipeline(steps=[('smote', SMOTE()), ('dt', DecisionTreeClassifier())])
grid = GridSearchCV(pipe, param_grid, scoring="accuracy", cv=5, n_jobs=-1)
dt_pred2 = cross_val_predict(grid, bin_features, labels, cv=10, method='predict_proba', n_jobs=-1, verbose=1)
print("DecisionTree with SMOTE")
pred = ["Yes" if v >= 0.5 else "No" for v in dt_pred2[:, 1]]
print(sk.metrics.classification_report(labels, pred))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 1.2min remaining: 5.0min
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.3min finished
DecisionTree without SMOTE
precision recall f1-score support
No 0.86 0.96 0.90 1233
Yes 0.43 0.16 0.23 237
accuracy 0.83 1470
macro avg 0.64 0.56 0.57 1470
weighted avg 0.79 0.83 0.80 1470
{'max_depth': 44, 'max_features': 'sqrt', 'min_samples_leaf': 32}
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 2 out of 10 | elapsed: 4.6min remaining: 18.5min
DecisionTree with SMOTE
precision recall f1-score support
No 0.87 0.79 0.83 1233
Yes 0.26 0.40 0.32 237
accuracy 0.73 1470
macro avg 0.57 0.59 0.57 1470
weighted avg 0.77 0.73 0.75 1470
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 4.7min finished
To preface, saying no all the time for the dataset would obtain a score of 83.8%. With this knowledge in mind, the DecisionTree Classifier has poor performance as it doesn't compensate well for the 'Yes' class. This could be due to the data having few features with strong distinctions between those who leave IBM and those who don't, making it hard for the Decision Tree to determing good features to create splits on.
To understand why the Decision Tree Classifier performs poorly, we wrote the following code block to visualize the classifier.
param_grid = {
'max_depth': list(range(35, 56, 1)),
'min_samples_leaf': list(range(1, 45, 1)),
'max_features': ['sqrt', 'log2']
}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring="accuracy", cv=5, n_jobs=-1)
grid.fit(bin_features, labels)
print(grid.best_params_)
keys = grid.best_params_.keys()
params = {}
for k in keys:
params[k[:]] = grid.best_params_[k]
estimator = DecisionTreeClassifier(**params)
estimator.fit(bin_features, labels)
# Look at decision tree
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot',
feature_names = bin_features.columns.to_list(),
class_names = ["Attrited", "Not Attrited"],
rounded = True, proportion = False,
precision = 2, filled = True)
# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
display(Image(filename = 'tree.png'))
{'max_depth': 48, 'max_features': 'sqrt', 'min_samples_leaf': 26}